election_data_clean <- election_data |>pivot_longer(cols ="BERDEAK-LOS VERDES":"COALICIÓN POR MELILLA", #Pivoting to make tidy datanames_to ="party",values_to ="votes")election_data_clean<-election_data_clean|>mutate(date_elec =make_date(anno, mes)) |>select(-tipo_eleccion, -codigo_distrito_electoral) |>mutate(votes =ifelse(is.na(votes), 0, votes)) #changing all NA values to 0, as we expect that the data set misinterprets 0s as NA
Abbreviation data
distinct_parties <- election_data_clean |>distinct(party)# Combine datasets as there are parties in election_data not present in abbrev_test <- abbrev |>full_join(distinct_parties, by =c("denominacion"="party")) #Creating a new column with the collected parties abbrev_collect<-abbrev_test |>mutate(party_collect =case_when(str_detect(siglas, "PSOE") |str_detect(siglas, "P.S.O.E") |str_detect(siglas, "psoe") |str_detect(denominacion, "PARTIDO SOCIALISTA") |str_detect(denominacion, "PSOE") |str_detect(denominacion, "P.S.O.E.") ~"PSOE", # PARTIDO SOCIALISTA OBRERO ESPAÑOLstr_detect(denominacion, "CONVERGENCIA I UNIO") ~"CU", # CONVERGÈNCIA I UNIÓstr_detect(siglas, "PP") |str_detect(denominacion, "PARTIDO POPULAR") ~"PP", # PARTIDO POPULARstr_detect(denominacion, "CIUDADANOS-") ~"CS", #CIUDADANOS str_detect(denominacion, "UNIDAS")|str_detect(siglas, "PODEMOS")|str_detect(denominacion, "VERDES")|str_detect(denominacion, "PODEM")|str_detect(denominacion, "EZKER") |str_detect(siglas, "IU")~"UP_IU", #UNIDAS PODEMOS - IU str_detect(denominacion, "BILDU")|str_detect(siglas, "EH")|str_detect(denominacion, "SORTU")|str_detect(denominacion, "ALKARTASUNA")|str_detect(denominacion, "ARALAR") |str_detect(denominacion, "ALTERNATIBA") ~"EH_BILDU", # EH - BILDU str_detect(denominacion, "VASCO") ~"PNV", # PARTIDO NACIONALISTA VASCOstr_detect(siglas, "BNG") |str_detect(denominacion, "GALLEG") |str_detect(denominacion, "GALICIA") |str_detect(denominacion, "GALEGUISTA")|str_detect(denominacion, "GALEG") |str_detect(denominacion, "GALIZ") ~"BNG", # BLOQUE NACIONALISTA GALLEGOstr_detect(siglas, "VOX") ~"VOX", # VOXstr_detect(siglas, "PAÍS") ~"MP", # MÁS PAÍSstr_detect(siglas, "COMPROMÍS") ~"CP", # COMPROMÍSstr_detect(siglas, "ERC") |str_detect(denominacion, "CATALUNYA SÍ") ~"ERC", # ESQUERRA REPUBLICANA DE CATALUNYATRUE~"OTHER"))|>distinct(denominacion, .keep_all =TRUE)
Survey data
survey_clean <- surveys |>filter(year(date_elec) >=2018) |>#Filtering only years >2018filter(exit_poll ==FALSE) |># Only keeping non exit pollsfilter(size >750| size !=NA) |>#Only keeping cases with a size ,larger than 750 or are bot NAmutate(field_days = field_date_to - field_date_from) |>#New column to count the number of field daysrelocate(field_days, .after = field_date_to) |>#Relocating the field_days columnfilter(field_days>0) |>#Filtering cases with field days >0pivot_longer(cols ="UCD":"EV", #Pivoting to make tidy datanames_to ="party",values_to ="vote_int") |>mutate(vote_int =ifelse(is.na(vote_int), 0, vote_int))
Joining tables
Joining election data with abbrev_collect
election_data_clean <- election_data_clean |>left_join(abbrev_collect, by =c("party"="denominacion"))
Joining survey data with abbrev_collect
#Joining the abbrev_collect table with the survey table and adding the abbreviations that are not in the abbrev table to the column: party_collectsurvey_clean_collect<-survey_clean |>left_join(abbrev_collect, by =c("party"="siglas")) |>mutate(party_collect =case_when(party_collect =="PSOE"~"PSOE", party_collect =="CS"~"CS", party_collect =="UP_IU"~"UP_IU", party_collect =="EH_BILDU"~"EH_BILDU", party_collect =="PP"~"PP", party_collect =="PNV"~"PNV", party_collect =="BNG"~"BNG", party_collect =="VOX"~"VOX", party_collect =="MP"~"MP", party_collect =="CP"~"CP", party_collect =="ERC"~"ERC", party_collect =="CU"~"CU",TRUE~ party)) |>select(-party)#Reducing the data set by taking out unsuseful variablessurvey_clean_collect_red <- survey_clean_collect |>select(-field_date_from, -field_date_to, -exit_poll) |>drop_na(vote_int)# removing siglas not to confuse with party_collectabbrev_collect<- abbrev_collect |>select(-siglas)
How is the vote of national parties (PSOE, PP, VOX, CS, MP, UP - IU) distributed against regional or nationalist parties?
national_parties <-c("PSOE", "PP", "VOX", "CS", "MP", "UP_IU") regional_nationalist_parties <-c("OTHER", "ERC", "CP", "BNG", "CU", "PNV", "EH_BILDU" )#data set I need for my questionelection_data_clean_1 <- election_data_clean |>select(-(anno:votos_candidaturas)) #slicing the sample to make the code and then will run it when its all done#election_data_sample <- election_data_clean_1 |> #slice_sample(prop =0.10)election_data_q1 <- election_data_clean_1 |>group_by(date_elec, party_collect) |>mutate(party_type =case_when(party_collect %in%c("PSOE", "PP", "VOX", "CS", "MP", "UP_IU") ~"national",TRUE~"regional/nationalist")) |>mutate(sum_votes =sum(votes)) |>ungroup() #Summing the total votes per party per electionelection_data_q1 <- election_data_q1 |>group_by(date_elec, party_type) |>mutate(votes_elec_party =sum(votes)) |>ungroup() #Summing the total votes voted per electionelection_data_q1 <- election_data_q1 |>group_by(date_elec) |>mutate(votes_elec_total =sum(votes)) |>ungroup()#Computing a percentage of how many votes a party received compared to the total votes votedelection_data_q1 <- election_data_q1 |>mutate(votes_elec_party_perc = (votes_elec_party/votes_elec_total)*100)#creating new dataset to visualize the relationship between the share of national parties and regional/ nationalist partiesgraph_data <- election_data_q1 |>group_by(date_elec) |>distinct(party_type, votes_elec_party_perc)
Q1 Visualisation
p1<-ggplot(data = graph_data, aes(x = date_elec, y = votes_elec_party_perc, color = party_type, group = party_type)) +geom_line() +geom_text(aes(label =sprintf("%.1f%%", votes_elec_party_perc)), vjust =-0.5, hjust =0.5, size =3, color ="black") +labs(title ="Vote Share of National and Regional Parties\nin Elections between 2008 and 2019",y ="",x ="Date of Election") +scale_y_continuous(breaks =seq(0, 100, by =10))+scale_x_date(date_breaks ="1 year", date_labels ="%Y")+theme_classic() +theme(legend.position ="none")
Q1 Visualisation
Q2
Which party was the winner in the municipalities with more than 100,000 habitants (census) in each of the elections?
election_data_clean_2 <-election_data_clean |>select(-(anno:vuelta)) |>mutate(codigo_ccaa_pro_muni =paste(codigo_ccaa, codigo_provincia, codigo_municipio, sep ="-")) |>#code to track each by municipalitymutate(total_votes_municipality = votos_blancos+ votos_nulos + votos_candidaturas) |>#total votes casted in each municipality for every rowgroup_by(date_elec, codigo_ccaa_pro_muni) |>mutate(votes_perc_party_municipality = (votes/total_votes_municipality)*100) |>#percentage of votes that a party got from the total amount of votes cast in each municipalityungroup() q2_final_result <- election_data_clean_2 |>filter(censo >100000) |>group_by(date_elec, codigo_ccaa_pro_muni) |>arrange(desc(votes_perc_party_municipality)) |>slice(1) |>ungroup()
Q2 Visualisation
p2 <-ggplot(q2_final_result, aes(x =party_collect))+geom_bar(stat ="count", fill ="black") +geom_text(stat ="count", aes(label =after_stat(count)), vjust =-0.5, size =3) +labs(title ="Election Winners in Municipalities with more than 100,000 inhabitants in each election", x ="Party", y ="Count") +guides(fill =guide_legend(title ="Party")) +theme_classic() +facet_wrap(~date_elec, scales ="free_x")
Q2 Visualisation
Q3
Which party was the second when the first was the PSOE? And when the first was the PP?
election_data_q3 <- election_data_clean_2 |>left_join(cod_mun, by =c("codigo_ccaa_pro_muni"="cod_mun")) |>filter(censo >100000) |>select(date_elec, party, party_collect, codigo_ccaa_pro_muni, municipio, votes_perc_party_municipality) |>group_by(date_elec, codigo_ccaa_pro_muni) |>arrange(desc(votes_perc_party_municipality)) |>mutate(rank =row_number()) |>ungroup() # Identify dates and municipalities where PSOE wonpsoe_wins <- election_data_q3 |>filter(party_collect =="PSOE", rank ==1) |>select(date_elec, codigo_ccaa_pro_muni)# Filter for municipalities where PSOE won and find the parties coming in second placeparties_2nd_place_after_PSOE <- election_data_q3 |>semi_join(psoe_wins, by =c("date_elec", "codigo_ccaa_pro_muni")) |>filter(rank ==2) |>select(date_elec, municipio, party, party_collect) # Identify dates and municipalities where PP wonPP_wins <- election_data_q3 |>filter(party_collect =="PP", rank ==1) |>select(date_elec, codigo_ccaa_pro_muni)# Filter for municipalities where PP won and find the parties coming in second placeparties_2nd_place_after_PP <- election_data_q3 |>semi_join(PP_wins, by =c("date_elec", "codigo_ccaa_pro_muni")) |>filter(rank ==2) |>select(date_elec, municipio, party, party_collect)
Q3 Visualisation I
This barplot illustrates the second place finisher for municipalities where the PSOE won.
p3 <-ggplot(parties_2nd_place_after_PSOE, aes(x =party_collect))+geom_bar(stat ="count", fill ="black") +geom_text(stat ="count", aes(label =after_stat(count)), vjust =-0.5, size =3) +labs(title ="Second placed parties if PSOE wins a municipality", x ="Party", y ="Count") +guides(fill =guide_legend(title ="Party")) +theme_classic()
Q3 Visualisation I
Q3 Visualisation II
This barplot illustrates the second place finisher for municipalities where the PP won.
p4 <-ggplot(parties_2nd_place_after_PP, aes(x =party_collect))+geom_bar(stat ="count", fill ="black") +geom_text(stat ="count", aes(label =after_stat(count)), vjust =-0.5, size =3) +labs(title ="Second placed parties if PP wins a municipality", x ="Party", y ="Count") +guides(fill =guide_legend(title ="Party")) +theme_classic()
Q3 Visualisation II
Q3 Answer
We now only read out the party that has come in second the most often when the PSOE has won using the glue package.
The party that has come in 2nd most often when the PSOE has won a municipality is PP.
The party that has come in 2nd most often when the PP has won a municipality is PSOE.
Historically, since 2008, in low turnout elections, smaller parties have had a higher chance of winning the election.
p5 <-ggplot(low_turnout_winners, aes(x = party_collect)) +geom_bar(stat ="count") +geom_text(stat ="count", aes(label =after_stat(count)), vjust =-0.5, size =3) +labs(title ="Number of Municipalities Won by Party in Low Turnout Elections",caption ="Low turnout defined as any election with a turnout less than 60% ",x ="Party",y ="") +scale_y_continuous(breaks =seq(0, 55000, by =10000)) +theme_classic()
Q4 Visualisation
Q5
How to analyze the relationship between census and vote? Is it true that certain parties win in rural areas?
#Subset with rural municipalities and their winnerselection_winners_rural<- election_data_clean |>filter(censo <30000, votes >0) |>mutate(municipio_id =paste(codigo_provincia, codigo_municipio, sep ="-")) |>group_by(date_elec, municipio_id) |>mutate(vote_share = (votes / (votos_candidaturas + votos_blancos + votos_nulos))*100) |>ungroup() |> dplyr::slice_max(n =1, vote_share, by =c(municipio_id)) |>select(vote_share, party, party_collect)
Q5 Visualisation
As may be expected, the PP is most successful in elections in rural areas.
#Plot the count of each party winning rural areasp6 <-ggplot(election_winners_rural, aes(x = party_collect)) +geom_bar(stat ="count") +geom_text(stat ="count", aes(label =after_stat(count)), vjust =-0.5, size =3) +labs(title ="Winning Parties in Rural Areas",caption ="Rural area defined as municipality with less than 30,000 inhabitants",x ="Party",y ="Count")+theme_classic()
Q5 Visualisation
Q6
How to calibrate the error of the polls (remember that the polls are voting intentions at national level)?
election_data_clean_red <- election_data_clean |>select(-anno, -mes, -vuelta,-participacion_1, -participacion_2, -votos_candidaturas) |>mutate(codigo_ccaa_pro_muni =paste(codigo_ccaa, codigo_provincia, codigo_municipio, sep ="-")) |>filter(year(date_elec) >=2018)#Summing the total votes per party per electionelection_data_clean_red<-election_data_clean_red |>group_by(date_elec, party_collect) |>mutate(votes_elec_party =sum(votes)) |>ungroup() #Summing the total votes voted per election election_data_clean_red<-election_data_clean_red |>group_by(codigo_ccaa_pro_muni, date_elec) |>mutate(votes_total_blancos_nulos = votos_blancos + votos_nulos) |>ungroup() |>group_by(date_elec) |>mutate(votes_total_blancos_nulos =sum (votes_total_blancos_nulos)) |>mutate(votes_elec =sum(votes)) |>ungroup() |>mutate(votes_elec_total = votes_total_blancos_nulos + votes_elec)#Computing a percentage of how many votes a party received compared to the total votes votedelection_data_clean_red<-election_data_clean_red |>mutate(votes_elec_party_perc = (votes_elec_party/votes_elec_total)*100)election_data_clean_red_7 <- election_data_clean_red |>select(id_join, votes_elec_total, codigo_ccaa, date_elec, votes_elec_party_perc, party_collect) |>distinct(id_join, date_elec, .keep_all =TRUE)survey_clean_collect_red_7 <- survey_clean_collect_red |>select(id_join, vote_int, id_pollster)# Joining survey_clean with election_data_clean using the id_join from both tablessurvey_elec <- election_data_clean_red_7 |>left_join(survey_clean_collect_red_7, by =c("id_join"="id_join"), relationship ="many-to-many") #subtracting the voting intentions (in percentage) of the percenatage of votes gained per per party per election. #As the difference can be both positive and negative, the differences in votes are squared to account for the negativessurvey_elec<-survey_elec |>mutate(diff_vote_int = votes_elec_party_perc - vote_int) |>mutate(diff_vote_int_2 =diff_vote_int^2) #Creating the average difference between percentage of votes gained and voting intentions per election#Taking the squareroot to return the variable in to the orginal unitssurvey_elec<-survey_elec |>group_by(date_elec) |>mutate(avg_diff_vote_int =sqrt(mean(diff_vote_int_2, na.rm =TRUE))) |>ungroup()
Q7
In which election were the polls most wrong?
#Checking in which election the polls were most wrong survey_elec_result<- survey_elec |>distinct(avg_diff_vote_int, date_elec)survey_elec_result
p7 <-ggplot(survey_elec_result, aes(x = date_elec_factor, y = avg_diff_vote_int)) +geom_col() +labs(title ="Error in election polls per election", x ="Election", y ="Average Difference votes and votes intentions") +theme_minimal()
Q7 Visualisation
Q8
How were the polls wrong in national parties (PSOE, PP, VOX, CS, MP, UP - IU)?
#Computing the average polling error per party per electionsurvey_elec_national<-survey_elec |>filter(party_collect =="PSOE"|party_collect =="PP"|party_collect =="VOX"|party_collect =="CS "|party_collect =="MP"|party_collect =="UP_IU") |>group_by(date_elec, party_collect) |>mutate(avg_diff_poll =sqrt(mean(diff_vote_int_2, na.rm =TRUE)) ) |>ungroup()
Q8 II
#Obtaining the resultssurvey_elec_national |>distinct(avg_diff_poll, date_elec, party_collect) |>arrange(date_elec)
p8 <-ggplot(survey_elec_national, aes(x = party_collect, y = avg_diff_poll, fill = date_elec_factor)) +geom_col(position ="dodge") +labs(title ="Error in election polls per party", x ="Party", y ="Average Difference votes and votes intentions",fill ="Election") +theme_minimal()
Q8 Visualisation
Q9
Which polling houses got it right the most and which ones deviated the most from the results?
#Computing the average polling error per polling house per election and obtaining the minimum and maximum polling with their corresponding pollhouses per electionsurvey_elec<-survey_elec |>group_by(id_pollster, date_elec) |>mutate(avg_diff_pollhouse =sqrt(mean(diff_vote_int_2, na.rm =TRUE))) |>ungroup() |>drop_na(avg_diff_pollhouse)#Obtaining the result for the maximum polling errorresult_max<-survey_elec|>group_by(date_elec) |>filter(avg_diff_pollhouse ==max(avg_diff_pollhouse)) |>distinct(avg_diff_pollhouse, id_pollster, date_elec)
#Obtaining the result for the minimum polling errorresult_min<-survey_elec|>group_by(date_elec) |>filter(avg_diff_pollhouse ==min(avg_diff_pollhouse)) |>distinct(avg_diff_pollhouse, id_pollster, date_elec) result_min
# ordering codigo variable for subsequent plotmax_turnout_mes04 <- election_data_clean_lollipop |>filter(mes =="04") |>group_by(codigo_ccaa) |>summarize(max_turnout =max(voter_turnout))# Order codigo_ccaa based on max_turnout_mes11order_by_max_turnout <- max_turnout_mes04 |>arrange(max_turnout) |>pull(codigo_ccaa)# Apply the order to the codigo_ccaa factorelection_data_clean_lollipop$codigo_ccaa <-factor( election_data_clean_lollipop$codigo_ccaa,levels = order_by_max_turnout)
p13 <-newggslopegraph(dataframe = election_data_mika,Times = convocatorias,Measurement = vote_share,Grouping = party_collect,Title ="Vote Distribution",SubTitle ="Percentage share of votes per party for elections from 2008 to 2019",LineColor = party_colors)